import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Toy corpus: a handful of spam and legitimate e-mail samples
X = [
    "Congratulations! You've won a $1000 gift card. Claim it now!",
    "Dear friend, I hope you are doing well. Let's catch up soon.",
    "Urgent: Your bank account has been compromised. Please contact support immediately.",
    "Hello, just wanted to confirm our meeting at 2 PM today.",
    "You have a new message from your friend. Click here to read.",
    "Get a free iPhone now! Limited offer, click here.",
    "Last chance to claim your prize, you won $500!",
    "Meeting scheduled for tomorrow. Please confirm.",
    "Hello! You are invited to an exclusive event!",
    "Click here to get free lottery tickets. Hurry up!",
    "Reminder: Your subscription will expire soon, renew now.",
    "Don't forget to submit your report by end of day today."
]
y = [1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0]  # 1 = spam, 0 = not spam

# Split the raw texts BEFORE vectorising so that the held-out e-mails never
# contribute to the vocabulary or the IDF weights (avoids train/test leakage).
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# TF-IDF vectorisation with English stop words removed, fitted on the
# training texts only; the test texts are merely transformed.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_train_text).toarray()
X_test = vectorizer.transform(X_test_text).toarray()

# Whole corpus projected into the same (train-fitted) feature space; kept so
# that any code still referring to X_vec continues to resolve the name.
X_vec = vectorizer.transform(X).toarray()


# Logistic-regression classifier
class LogisticRegressionModel(nn.Module):
    """Binary logistic regression: one linear layer followed by a sigmoid.

    Args:
        input_dim: Number of input features per sample.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Single affine map from the feature vector to one output unit.
        self.fc = nn.Linear(in_features=input_dim, out_features=1)

    def forward(self, x):
        """Apply the linear layer and squash its output to a 0..1 probability."""
        logits = self.fc(x)
        return logits.sigmoid()


# Training loop
def train_model(model, X_train, y_train, num_epochs=200, learning_rate=0.001):
    """Fit ``model`` with full-batch Adam on a binary cross-entropy loss.

    Args:
        model: ``nn.Module`` whose forward pass returns probabilities; the
            loss is ``nn.BCELoss`` applied directly to its outputs, which are
            compared against a column of labels.
        X_train: Training features (array-like or tensor); converted to a
            float32 tensor.
        y_train: Training labels (array-like or tensor); converted to a
            float32 column of shape (n_samples, 1).
        num_epochs: Number of full-batch optimisation steps.
        learning_rate: Step size passed to Adam.

    Returns:
        A list with one float per epoch: the loss computed in that epoch's
        forward pass (i.e. before that epoch's parameter update).
    """
    criterion = nn.BCELoss()  # binary cross-entropy on probabilities
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # as_tensor accepts lists, NumPy arrays and tensors alike without the
    # copy-construct warning torch.tensor() emits for tensor inputs; reshape
    # (unlike view) also copes with non-contiguous label tensors.
    X_train_tensor = torch.as_tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.as_tensor(y_train, dtype=torch.float32).reshape(-1, 1)

    # Training mode does not change between epochs, so set it once.
    model.train()

    loss_history = []
    for epoch in range(num_epochs):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

        loss_value = loss.item()
        loss_history.append(loss_value)

        # Progress report every 10th epoch.
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss_value:.4f}')

    return loss_history


# Evaluation
def evaluate_model(model, X_test, y_test):
    """Print and return the classification accuracy of ``model`` on a test set.

    The model is switched to eval mode, run without gradient tracking, and
    its outputs are thresholded at 0.5 to obtain 0/1 predictions.

    Args:
        model: ``nn.Module`` expected to return one probability per sample,
            i.e. an output of shape (n_samples, 1).
        X_test: Test features (array-like or tensor); converted to float32.
        y_test: Test labels (array-like or tensor); converted to a float32
            column of shape (n_samples, 1).

    Returns:
        Fraction of samples whose thresholded prediction equals the label,
        as a float in [0, 1].

    Raises:
        ValueError: If ``y_test`` is empty, or if the model output does not
            line up one-to-one with the labels.
    """
    model.eval()
    X_test_tensor = torch.as_tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.as_tensor(y_test, dtype=torch.float32).reshape(-1, 1)

    sample_count = y_test_tensor.shape[0]
    if sample_count == 0:
        raise ValueError("Cannot compute accuracy on an empty test set")

    with torch.no_grad():
        outputs = model(X_test_tensor)
    predictions = (outputs >= 0.5).float()  # decision threshold of 0.5

    # Guard against silent broadcasting when sizes disagree.
    if predictions.shape != y_test_tensor.shape:
        raise ValueError(
            f"Prediction shape {tuple(predictions.shape)} does not match "
            f"label shape {tuple(y_test_tensor.shape)}"
        )

    # Compare directly on tensors; integer count / count keeps the ratio exact.
    correct = (predictions == y_test_tensor).sum().item()
    accuracy = correct / sample_count
    print(f'Accuracy: {accuracy * 100:.2f}%')
    return accuracy


# Train and evaluate the model
# The train/test split is seeded, so seed the weight initialisation as well;
# otherwise the reported accuracy and predictions differ from run to run.
torch.manual_seed(42)
input_dim = X_train.shape[1]  # number of input features
model = LogisticRegressionModel(input_dim)
train_model(model, X_train, y_train, num_epochs=200, learning_rate=0.001)
evaluate_model(model, X_test, y_test)


# Classify a single new e-mail
def predict(model, new_email):
    """Label one raw e-mail string as "Spam" or "Not Spam".

    The text is vectorised with the module-level ``vectorizer``, fed through
    ``model`` in eval mode without gradient tracking, and the model output is
    compared against a 0.5 threshold.

    Args:
        model: Trained classifier that is called on the feature tensor.
        new_email: The e-mail text to classify.

    Returns:
        "Spam" when the model output is at least 0.5, otherwise "Not Spam".
    """
    model.eval()
    features = torch.tensor(
        vectorizer.transform([new_email]).toarray(),
        dtype=torch.float32,
    )
    with torch.no_grad():
        score = model(features)
    if score >= 0.5:
        return "Spam"
    return "Not Spam"


# Try the classifier on two unseen e-mails
email_1 = "Congratulations! You have a limited time offer for a free cruise."
label_1 = predict(model, email_1)
print(f"Email 1: {label_1}")  # may print: Spam

email_2 = "Hi, let's discuss the project updates tomorrow."
label_2 = predict(model, email_2)
print(f"Email 2: {label_2}")  # may print: Not Spam
